# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************

# This dataset should be downloaded from http://jmcauley.ucsd.edu/data/amazon/
# The terms and conditions of the data set license apply.
# Intel does not grant any rights to the data files.
# The Amazon Review Dataset was published in the following papers:
# Ups and downs:
# Modeling the visual evolution of fashion trends with one-class collaborative filtering
# R. He, J. McAuley
# WWW, 2016
# Image-based recommendations on styles and substitutes
# J. McAuley, C. Targett, J. Shi, A. van den Hengel
# SIGIR, 2015

import pandas as pd
import json

from nlp_architect.utils.generic import normalize, balance

def review_to_sentiment(review):
    """Convert one raw review row into a ``[sentiment, clean_text]`` pair.

    Args:
        review: indexable row laid out as ``(overall, reviewText, summary)``,
            i.e. the rating at index 0, the review body at index 1 and the
            summary at index 2.

    Returns:
        list: ``[label, norm_text]``. ``label`` is ``'positive'`` when the
        rating is above 3, ``'negative'`` when it is below 3 and
        ``'neutral'`` otherwise. ``norm_text`` is the result of
        ``normalize`` applied to the summary and the review body joined by
        a single space.
    """
    # The summary is placed ahead of the review body before cleaning.
    norm_text = normalize(review[2] + " " + review[1])

    rating = review[0]
    if rating > 3:
        label = 'positive'
    elif rating < 3:
        label = 'negative'
    else:
        # Exactly 3 (or anything that compares neither above nor below 3).
        label = 'neutral'
    return [label, norm_text]
# Keys read from every review record, in the order review_to_sentiment indexes
# them: rating (0), review body (1), summary (2).
# NOTE(review): __init__ relies on this name but no definition is visible in
# the file; the key names follow the "overall / reviewText / summary" layout
# described for the review rows -- confirm against the dataset schema.
good_columns = ["overall", "reviewText", "summary"]


class Amazon_Reviews(object):
    """
    Load a file of Amazon reviews holding one JSON object per line, clean each
    review and map its 1-5 rating to a sentiment label.

    Args:
        review_file: path of the review file to parse.
        run_balance (bool): when True, ``process`` passes the filtered frame
            through ``balance``.

    Attributes set by ``__init__`` (``process`` refreshes all but ``all_text``):
        amazon: DataFrame with the columns ``Sentiment`` and ``clean_text``.
        all_text: the ``clean_text`` column of the unfiltered frame.
        labels_0: ``pd.get_dummies`` encoding of ``Sentiment`` (keeps headers).
        labels: ``labels_0.values``.
        text: ``clean_text`` as an array.
    """

    def __init__(self, review_file, run_balance=True):
        self.run_balance = run_balance

        print("Parsing and processing json file")
        data = []
        with open(review_file, 'r') as f:
            for line in f:
                # Tolerate blank lines (e.g. a trailing newline at EOF), which
                # json.loads would otherwise reject.
                if not line.strip():
                    continue
                data_line = json.loads(line)
                selected_row = [data_line[item] for item in good_columns]
                # as we read in, clean
                data.append(review_to_sentiment(selected_row))

        # Not sure how to easily balance outside of pandas...
        # but should replace eventually
        # NOTE(review): the attribute identifier was lost in the garbled
        # source; restored as `amazon` -- confirm against callers.
        self.amazon = pd.DataFrame(data, columns=['Sentiment', 'clean_text'])
        self.all_text = self.amazon['clean_text']
        self._refresh_arrays()

    def process(self):
        """Keep only positive/negative reviews, optionally balance them, and
        rebuild ``labels_0``, ``labels`` and ``text`` from the result."""
        is_polar = self.amazon['Sentiment'].isin(['positive', 'negative'])
        self.amazon = self.amazon[is_polar]

        if self.run_balance:
            # balance it out
            self.amazon = balance(self.amazon)

        print("Sample Data")
        print(self.amazon[['Sentiment', 'clean_text']].head())

        self._refresh_arrays()

    def _refresh_arrays(self):
        """Derive the label and text arrays from the current ``amazon`` frame."""
        # mapping of the labels with dummies (has headers)
        self.labels_0 = pd.get_dummies(self.amazon['Sentiment'])
        self.labels = self.labels_0.values
        self.text = self.amazon['clean_text'].values